# Load the raw training data from the project's data directory.
train <- read_csv(file = "data/train.csv")
## Rows: 27674 Columns: 83
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (12): patient_race, payer_type, patient_state, breast_cancer_diagnosis_c...
## dbl (70): patient_id, patient_zip3, patient_age, bmi, breast_cancer_diagnosi...
## lgl  (1): patient_gender
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Quick look at the raw data: dimensions, column names, and a skim summary.
ncol(train)
nrow(train)
colnames(train)
skim(train)

# Reproducible 70/30 split, partitioned on the outcome treatment_pd.
set.seed(3911)
inTrain <- createDataPartition(
  y = train[["treatment_pd"]],
  p = 0.7,
  list = FALSE
)
training <- train[inTrain, ]
testing <- train[-inTrain, ]
glimpse(training)
## Rows: 19,373
## Columns: 83
## $ patient_id                            <dbl> 994155, 154389, 921275, 235192, …
## $ patient_race                          <chr> "Asian", NA, "Hispanic", NA, "Wh…
## $ payer_type                            <chr> "COMMERCIAL", "MEDICARE ADVANTAG…
## $ patient_state                         <chr> "CA", "OH", "CA", "IN", "NM", "A…
## $ patient_zip3                          <dbl> 917, 451, 928, 462, 877, 356, 33…
## $ patient_age                           <dbl> 46, 63, 50, 35, 58, 37, 67, 45, …
## $ patient_gender                        <lgl> FALSE, FALSE, FALSE, FALSE, FALS…
## $ bmi                                   <dbl> 27.00, NA, NA, NA, NA, NA, NA, N…
## $ breast_cancer_diagnosis_code          <chr> "C50811", "C50412", "1749", "C50…
## $ breast_cancer_diagnosis_desc          <chr> "Malignant neoplasm of ovrlp sit…
## $ breast_cancer_diagnosis_year          <dbl> 2018, 2018, 2015, 2016, 2015, 20…
## $ metastatic_cancer_diagnosis_code      <chr> "C779", "C7951", "C787", "C773",…
## $ metastatic_first_treatment            <chr> "DOXORUBICIN HCL", "DOXORUBICIN …
## $ metastatic_first_treatment_type       <chr> NA, NA, NA, "Antineoplastics", "…
## $ metastatic_first_novel_treatment      <chr> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ metastatic_first_novel_treatment_type <chr> NA, NA, NA, NA, NA, NA, NA, NA, …
## $ region                                <chr> "West", "Midwest", "West", "Midw…
## $ division                              <chr> "Pacific", "East North Central",…
## $ population                            <dbl> 43031, 7228, 39122, 25675, 1266,…
## $ density                               <dbl> 2048.578261, 194.656250, 2295.93…
## $ age_median                            <dbl> 38.85217, 41.24783, 38.20000, 35…
## $ age_under_10                          <dbl> 11.306522, 12.855319, 11.878788,…
## $ age_10_to_19                          <dbl> 12.897826, 12.789362, 13.354545,…
## $ age_20s                               <dbl> 14.121739, 11.261702, 14.230303,…
## $ age_30s                               <dbl> 13.532609, 10.489362, 13.418182,…
## $ age_40s                               <dbl> 13.160870, 11.859574, 13.333333,…
## $ age_50s                               <dbl> 13.37826, 15.27872, 14.06061, 11…
## $ age_60s                               <dbl> 11.473913, 13.359574, 10.248485,…
## $ age_70s                               <dbl> 6.380435, 6.434043, 5.951515, 6.…
## $ age_over_80                           <dbl> 3.736957, 5.663830, 3.503030, 3.…
## $ male                                  <dbl> 49.05217, 52.09149, 49.89394, 48…
## $ female                                <dbl> 50.94783, 47.90851, 50.10606, 51…
## $ married                               <dbl> 48.50435, 50.67234, 50.24545, 39…
## $ divorced                              <dbl> 10.117391, 14.102128, 9.827273, …
## $ never_married                         <dbl> 36.40870, 27.11702, 35.29091, 41…
## $ widowed                               <dbl> 4.969565, 8.112766, 4.651515, 4.…
## $ family_size                           <dbl> 3.674783, 3.119565, 3.622727, 3.…
## $ family_dual_income                    <dbl> 59.21957, 51.22826, 61.73636, 54…
## $ income_household_median               <dbl> 86330.39, 65214.72, 102741.64, 6…
## $ income_household_under_5              <dbl> 2.226087, 2.329787, 2.327273, 4.…
## $ income_household_5_to_10              <dbl> 1.5282609, 3.2489362, 1.5363636,…
## $ income_household_10_to_15             <dbl> 2.897826, 4.468085, 2.648485, 3.…
## $ income_household_15_to_20             <dbl> 2.747826, 5.878723, 2.178788, 4.…
## $ income_household_20_to_25             <dbl> 3.173913, 5.323404, 2.409091, 5.…
## $ income_household_25_to_35             <dbl> 6.647826, 7.840426, 5.163636, 8.…
## $ income_household_35_to_50             <dbl> 9.617391, 12.246809, 7.972727, 1…
## $ income_household_50_to_75             <dbl> 15.96522, 20.04043, 13.93636, 17…
## $ income_household_75_to_100            <dbl> 13.589130, 14.051064, 12.469697,…
## $ income_household_100_to_150           <dbl> 19.75217, 15.67660, 19.76061, 13…
## $ income_household_150_over             <dbl> 21.847826, 8.902128, 29.596970, …
## $ income_household_six_figure           <dbl> 41.60000, 24.57872, 49.35758, 26…
## $ income_individual_median              <dbl> 34317.83, 32142.23, 41287.27, 36…
## $ home_ownership                        <dbl> 61.39783, 72.39149, 61.46364, 58…
## $ housing_units                         <dbl> 12609.2609, 2789.9583, 11725.666…
## $ home_value                            <dbl> 572606.5, 155901.8, 677688.5, 17…
## $ rent_median                           <dbl> 1778.0000, 828.0000, 2003.1250, …
## $ rent_burden                           <dbl> 34.59565, 26.51429, 34.75312, 31…
## $ education_less_highschool             <dbl> 17.491304, 15.829787, 14.230303,…
## $ education_highschool                  <dbl> 22.65652, 38.96809, 19.98788, 25…
## $ education_some_college                <dbl> 29.26304, 27.68298, 29.79697, 25…
## $ education_bachelors                   <dbl> 20.200000, 11.625532, 23.739394,…
## $ education_graduate                    <dbl> 10.404348, 5.887234, 12.245455, …
## $ education_college_or_above            <dbl> 30.60435, 17.51277, 35.98485, 37…
## $ education_stem_degree                 <dbl> 46.20870, 38.30889, 47.91818, 40…
## $ labor_force_participation             <dbl> 63.15435, 61.27234, 65.23030, 67…
## $ unemployment_rate                     <dbl> 6.197826, 5.793478, 5.103030, 5.…
## $ self_employed                         <dbl> 15.708696, 11.202500, 15.224242,…
## $ farmer                                <dbl> 0.015217391, 3.715000000, 0.0272…
## $ race_white                            <dbl> 38.70870, 96.05532, 54.03030, 61…
## $ race_black                            <dbl> 3.9630435, 1.0063830, 2.5272727,…
## $ race_asian                            <dbl> 25.5652174, 0.3212766, 20.827272…
## $ race_native                           <dbl> 1.19347826, 0.11702128, 0.587878…
## $ race_pacific                          <dbl> 0.269565217, 0.002127660, 0.3000…
## $ race_other                            <dbl> 18.8586957, 0.2553191, 11.645454…
## $ race_multiple                         <dbl> 11.426087, 2.234043, 10.081818, …
## $ hispanic                              <dbl> 47.726087, 1.182979, 37.948485, …
## $ disabled                              <dbl> 9.895652, 18.317021, 8.957576, 1…
## $ poverty                               <dbl> 10.515217, 13.546809, 10.109091,…
## $ limited_english                       <dbl> 12.7456522, 0.1468085, 8.0575758…
## $ commute_time                          <dbl> 32.53043, 31.89091, 30.60606, 23…
## $ health_uninsured                      <dbl> 7.263043, 7.631915, 7.018182, 8.…
## $ veteran                               <dbl> 3.810870, 9.631915, 4.103030, 6.…
## $ treatment_pd                          <dbl> 35, 33, 455, 75, 393, 62, 43, 13…
# Confirm the dimensions and column names of the training split.
ncol(training)
## [1] 83
nrow(training)
## [1] 19373
colnames(training)
##  [1] "patient_id"                           
##  [2] "patient_race"                         
##  [3] "payer_type"                           
##  [4] "patient_state"                        
##  [5] "patient_zip3"                         
##  [6] "patient_age"                          
##  [7] "patient_gender"                       
##  [8] "bmi"                                  
##  [9] "breast_cancer_diagnosis_code"         
## [10] "breast_cancer_diagnosis_desc"         
## [11] "breast_cancer_diagnosis_year"         
## [12] "metastatic_cancer_diagnosis_code"     
## [13] "metastatic_first_treatment"           
## [14] "metastatic_first_treatment_type"      
## [15] "metastatic_first_novel_treatment"     
## [16] "metastatic_first_novel_treatment_type"
## [17] "region"                               
## [18] "division"                             
## [19] "population"                           
## [20] "density"                              
## [21] "age_median"                           
## [22] "age_under_10"                         
## [23] "age_10_to_19"                         
## [24] "age_20s"                              
## [25] "age_30s"                              
## [26] "age_40s"                              
## [27] "age_50s"                              
## [28] "age_60s"                              
## [29] "age_70s"                              
## [30] "age_over_80"                          
## [31] "male"                                 
## [32] "female"                               
## [33] "married"                              
## [34] "divorced"                             
## [35] "never_married"                        
## [36] "widowed"                              
## [37] "family_size"                          
## [38] "family_dual_income"                   
## [39] "income_household_median"              
## [40] "income_household_under_5"             
## [41] "income_household_5_to_10"             
## [42] "income_household_10_to_15"            
## [43] "income_household_15_to_20"            
## [44] "income_household_20_to_25"            
## [45] "income_household_25_to_35"            
## [46] "income_household_35_to_50"            
## [47] "income_household_50_to_75"            
## [48] "income_household_75_to_100"           
## [49] "income_household_100_to_150"          
## [50] "income_household_150_over"            
## [51] "income_household_six_figure"          
## [52] "income_individual_median"             
## [53] "home_ownership"                       
## [54] "housing_units"                        
## [55] "home_value"                           
## [56] "rent_median"                          
## [57] "rent_burden"                          
## [58] "education_less_highschool"            
## [59] "education_highschool"                 
## [60] "education_some_college"               
## [61] "education_bachelors"                  
## [62] "education_graduate"                   
## [63] "education_college_or_above"           
## [64] "education_stem_degree"                
## [65] "labor_force_participation"            
## [66] "unemployment_rate"                    
## [67] "self_employed"                        
## [68] "farmer"                               
## [69] "race_white"                           
## [70] "race_black"                           
## [71] "race_asian"                           
## [72] "race_native"                          
## [73] "race_pacific"                         
## [74] "race_other"                           
## [75] "race_multiple"                        
## [76] "hispanic"                             
## [77] "disabled"                             
## [78] "poverty"                              
## [79] "limited_english"                      
## [80] "commute_time"                         
## [81] "health_uninsured"                     
## [82] "veteran"                              
## [83] "treatment_pd"
# Per-column summary of the training split (missing counts, mean, sd,
# quantiles), used below to decide which columns to drop or impute.
skim(training)

Variable type: numeric

var n na mean sd p0 p25 p50 p75 p100
patient_id 19373 0 549376.82 260633.56 100051.00 323144.00 549366.00 776646.00 999996.00
patient_zip3 19373 0 523.98 292.01 6.00 296.00 490.00 785.00 996.00
patient_age 19373 0 54.43 11.20 19.00 47.00 55.00 62.00 91.00
bmi 19373 13243 29.33 5.65 14.00 25.00 28.88 33.00 97.00
breast_cancer_diagnosis_year 19373 0 2016.49 1.06 2015.00 2016.00 2016.00 2017.00 2018.00
population 19373 0 19954.11 13462.14 636.00 9155.00 18355.00 27842.00 71374.00
density 19373 0 1656.37 3651.62 0.82 169.32 627.90 1530.77 29851.69
age_median 19373 0 40.67 4.01 20.60 37.73 40.73 43.14 57.42
age_under_10 19373 0 11.12 1.54 0.00 10.15 11.04 12.19 17.68
age_10_to_19 19373 0 12.87 1.87 6.31 11.79 12.92 13.91 35.30
age_20s 19373 0 13.14 3.25 5.92 11.00 12.45 14.48 62.10
age_30s 19373 0 12.78 2.38 1.50 11.21 12.38 13.81 25.47
age_40s 19373 0 12.09 1.24 0.80 11.36 12.14 12.91 17.82
age_50s 19373 0 13.57 1.67 0.00 12.39 13.64 14.75 22.91
age_60s 19373 0 12.70 2.55 0.20 10.69 12.62 14.09 24.51
age_70s 19373 0 7.70 2.15 0.00 6.09 7.38 8.91 16.27
age_over_80 19373 0 4.02 1.24 0.00 3.29 3.82 4.56 18.82
male 19373 0 50.05 1.65 39.73 49.07 49.93 50.91 61.60
female 19373 0 49.95 1.65 38.40 49.09 50.07 50.93 60.27
married 19373 0 47.99 7.44 0.90 43.27 49.87 53.21 66.90
divorced 19373 0 12.65 2.09 0.20 11.07 12.65 14.19 21.03
never_married 19373 0 33.47 7.87 13.44 27.41 31.88 38.41 98.90
widowed 19373 0 5.89 1.55 0.00 4.77 5.60 6.67 20.65
family_size 19373 3 3.19 0.22 2.55 3.04 3.16 3.31 4.17
family_dual_income 19373 3 51.81 7.10 19.31 47.66 52.66 57.02 70.92
income_household_median 19373 3 75071.32 22100.49 18465.41 60923.48 70760.74 86319.88 164119.18
income_household_under_5 19373 3 3.29 1.66 0.71 2.18 2.83 4.02 19.62
income_household_5_to_10 19373 3 2.54 1.50 0.36 1.51 2.16 3.16 12.72
income_household_10_to_15 19373 3 4.14 1.87 0.65 2.81 3.71 5.09 14.42
income_household_15_to_20 19373 3 3.95 1.54 1.03 2.78 3.75 4.71 12.40
income_household_20_to_25 19373 3 4.08 1.41 0.84 3.04 4.00 4.88 14.35
income_household_25_to_35 19373 3 8.38 2.30 1.86 6.76 8.42 9.97 26.55
income_household_35_to_50 19373 3 11.47 2.66 1.70 9.69 11.71 13.29 24.08
income_household_50_to_75 19373 3 16.72 2.81 4.95 15.19 16.98 18.45 27.13
income_household_75_to_100 19373 3 12.63 1.97 3.10 11.36 12.67 13.91 24.80
income_household_100_to_150 19373 3 15.78 3.33 1.67 13.60 15.99 18.42 31.32
income_household_150_over 19373 3 17.02 9.45 0.84 9.97 14.74 21.73 52.82
income_household_six_figure 19373 3 32.80 11.72 3.44 24.32 30.66 40.91 69.03
income_individual_median 19373 0 36956.54 9149.77 4316.00 31232.79 35526.82 41287.27 88910.50
home_ownership 19373 3 66.61 14.20 15.85 56.90 71.33 77.42 90.37
housing_units 19373 0 7338.64 4850.21 0.00 3378.00 6653.95 10666.38 25922.55
home_value 19373 3 324338.37 238712.74 60629.00 163904.33 242866.07 378014.35 1853109.20
rent_median 19373 3 1225.03 423.92 448.40 891.74 1164.49 1467.65 2965.25
rent_burden 19373 3 31.28 4.81 16.69 28.34 30.83 33.63 78.94
education_less_highschool 19373 0 11.85 5.20 0.00 7.89 10.71 15.02 34.33
education_highschool 19373 0 27.99 7.96 0.00 22.50 27.60 33.60 53.84
education_some_college 19373 0 28.49 5.10 7.20 25.22 29.06 32.19 50.13
education_bachelors 19373 0 19.30 6.44 2.47 13.92 18.83 24.14 41.70
education_graduate 19373 0 12.37 6.21 2.09 7.49 10.80 16.11 51.84
education_college_or_above 19373 0 31.67 12.22 7.05 21.63 29.99 40.66 77.82
education_stem_degree 19373 0 43.25 4.58 23.91 40.08 43.02 45.81 73.00
labor_force_participation 19373 0 61.65 6.13 30.70 57.93 62.82 65.91 78.67
unemployment_rate 19373 0 5.97 2.03 0.99 4.76 5.47 6.74 18.80
self_employed 19373 1501 13.01 3.29 2.26 10.73 12.55 14.95 26.62
farmer 19373 1501 2.04 3.27 0.00 0.05 0.62 2.69 27.54
race_white 19373 0 70.26 17.77 14.50 57.69 70.99 85.58 98.44
race_black 19373 0 11.71 12.69 0.04 2.41 6.53 17.12 69.66
race_asian 19373 0 5.19 6.44 0.00 1.05 2.70 6.63 49.85
race_native 19373 0 0.86 2.51 0.00 0.20 0.34 0.72 76.93
race_pacific 19373 0 0.12 0.46 0.00 0.02 0.05 0.12 14.76
race_other 19373 0 5.38 5.97 0.00 1.31 3.36 7.63 33.19
race_multiple 19373 0 6.49 3.55 0.43 3.89 5.58 8.69 26.43
hispanic 19373 0 17.95 17.59 0.06 4.64 11.70 24.63 99.10
disabled 19373 0 13.44 3.82 4.60 10.27 13.07 15.61 35.16
poverty 19373 3 13.34 5.75 3.43 9.34 12.12 16.43 48.93
limited_english 19373 3 4.63 6.62 0.00 0.98 2.67 5.96 74.07
commute_time 19373 0 28.06 4.88 12.46 24.97 27.81 30.92 48.02
health_uninsured 19373 0 8.57 4.30 1.17 5.50 7.44 10.69 27.57
veteran 19373 0 7.08 3.11 1.20 4.97 6.88 8.65 25.20
treatment_pd 19373 0 134.52 188.25 0.00 36.00 70.00 133.00 1446.00
# Rows = 19373
# Count the missing values in every column of the training split.
# across() with a lambda replaces the deprecated funs() / summarise_all() form.
training %>%
  summarise(across(everything(), ~ sum(is.na(.x))))
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## ℹ Please use a list of either functions or lambdas:
## 
## # Simple named list: list(mean = mean, median = median)
## 
## # Auto named with `tibble::lst()`: tibble::lst(mean, median)
## 
## # Using lambdas list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## # A tibble: 1 × 83
##   patient_id patient_race payer_type patient_state patient_zip3 patient_age
##        <int>        <int>      <int>         <int>        <int>       <int>
## 1          0        10614       1982            52            0           0
## # ℹ 77 more variables: patient_gender <int>, bmi <int>,
## #   breast_cancer_diagnosis_code <int>, breast_cancer_diagnosis_desc <int>,
## #   breast_cancer_diagnosis_year <int>, metastatic_cancer_diagnosis_code <int>,
## #   metastatic_first_treatment <int>, metastatic_first_treatment_type <int>,
## #   metastatic_first_novel_treatment <int>,
## #   metastatic_first_novel_treatment_type <int>, region <int>, division <int>,
## #   population <int>, density <int>, age_median <int>, age_under_10 <int>, …
# Column-wise minimum over rows that have no missing value in any column.
# NOTE(review): na.omit() drops every row containing an NA in any column --
# confirm enough rows survive for these extremes to be meaningful.
training %>%
  na.omit() %>%
  summarise(across(.cols = everything(), .fns = min))

# Column-wise maximum over the same complete rows.
training %>%
  na.omit() %>%
  summarise(across(.cols = everything(), .fns = max))

# Patient ages in ascending order.
training %>%
  select(patient_age) %>%
  arrange(patient_age)

# Frequency of each recorded gender value.
count(training, patient_gender)

# Patients per state, most frequent first.
training %>%
  count(patient_state, sort = TRUE)

# Inspect the candidate columns for removal: a summary() of each column
# followed by the frequency of each distinct value.
# summary() takes no extra argument here; a bare NA passed after the data
# would be matched positionally to `maxsum`.

# Author's note: 13243 NA's
# NOTE(review): 13243 is the NA count skim() reports for bmi -- confirm it also
# applies to metastatic_first_novel_treatment.
training %>%
  select(metastatic_first_novel_treatment) %>%
  summary()

training %>%
  count(metastatic_first_novel_treatment)

training %>%
  select(metastatic_first_novel_treatment_type) %>%
  summary()
training %>%
  count(metastatic_first_novel_treatment_type)

training %>%
  select(bmi) %>%
  summary()

training %>%
  count(bmi)

training %>%
  select(patient_race) %>%
  summary()

training %>%
  count(patient_race)

# Summarise the same column that is counted just below
# (metastatic_first_treatment_type, not the novel-treatment type again).
training %>%
  select(metastatic_first_treatment_type) %>%
  summary()

training %>%
  count(metastatic_first_treatment_type)

# Diagnosis descriptions, most frequent first.
training %>%
  count(breast_cancer_diagnosis_desc) %>%
  arrange(desc(n))

training %>%
  count(metastatic_cancer_diagnosis_code)
# Drop the predictors judged to have too many missing values.
# NOTE(review): the visible code does not drop the same columns from
# `testing` -- confirm that happens elsewhere before modelling.
training <- training %>%
  select(-c(
    metastatic_first_novel_treatment,
    metastatic_first_novel_treatment_type,
    patient_race,
    payer_type,
    bmi,
    metastatic_first_treatment_type,
    self_employed,
    farmer
  ))
# Frequency tables for the diagnosis-code columns (most frequent first) and
# for the first-treatment drug names.
training %>%
  count(breast_cancer_diagnosis_code, sort = TRUE)

training %>%
  count(metastatic_cancer_diagnosis_code, sort = TRUE)

count(training, metastatic_first_treatment)
# Pull the free-text diagnosis description out as a plain vector.
finding_side <- training %>%
  pull(breast_cancer_diagnosis_desc)

# For each laterality keyword, extract it from every description and print
# the result as a name/value tibble.
left_side <- str_extract(finding_side, pattern = "left")

# NOTE(review): `any` duplicates left_side and shares its name with
# base::any(); kept unchanged in case later code reads it.
any <- str_extract(finding_side, pattern = "left")

enframe(left_side)

right_side <- str_extract(finding_side, pattern = "right")

enframe(right_side)

unspecified_side <- str_extract(finding_side, pattern = "unspecified")

enframe(unspecified_side)
# Derive tumour laterality from the diagnosis description.
# The keyword test runs on the column itself rather than on a separately
# extracted vector, so the result cannot drift out of row alignment with
# `training`. Descriptions mentioning neither "left" nor "right" are labelled
# "unspecified".
training <- training %>%
  mutate(breast_cancer_diagnosis_side = case_when(
    str_detect(breast_cancer_diagnosis_desc, "left") ~ "left",
    str_detect(breast_cancer_diagnosis_desc, "right") ~ "right",
    TRUE ~ "unspecified"
  ))

# Check how many patients fall into each laterality group.
training %>%
  count(breast_cancer_diagnosis_side)
## # A tibble: 3 × 2
##   breast_cancer_diagnosis_side     n
##   <chr>                        <int>
## 1 left                          7394
## 2 right                         7167
## 3 unspecified                   4812
# left = 7394
# right = 7167
# unspecified = 4812
# Distinct first-treatment drug names and how often each occurs, to spot
# spelling / formulation variants worth merging.
count(training, metastatic_first_treatment)
## # A tibble: 37 × 2
##    metastatic_first_treatment     n
##    <chr>                      <int>
##  1 BEVACIZUMAB                  154
##  2 BLEOMYCIN SULFATE              1
##  3 CAPECITABINE                1121
##  4 CARBOPLATIN                 3158
##  5 CISPLATIN                     78
##  6 CYCLOPHOSPHAMIDE            2920
##  7 DOCETAXEL                    817
##  8 DOCETAXEL ANHYDROUS           83
##  9 DOXORUBICIN HCL             6895
## 10 DOXORUBICIN HCL LIPOSOMAL    123
## # ℹ 27 more rows
# Collapse formulation / salt-name variants of the same drug onto one label.
# Any treatment not listed keeps its original value.
training <- mutate(
  training,
  metastatic_first_treatment = case_when(
    metastatic_first_treatment %in% "DOCETAXEL ANHYDROUS" ~
      "DOCETAXEL",
    metastatic_first_treatment %in%
      c("DOXORUBICIN HCL LIPOSOMAL", "DOXORUBICIN HCL") ~
      "DOXORUBICIN HYDROCHLORIDE",
    metastatic_first_treatment %in% "EPIRUBICIN HCL" ~
      "EPIRUBICIN HYDROCHLORIDE",
    metastatic_first_treatment %in% "GEMCITABINE HCL" ~
      "GEMCITABINE HYDROCHLORIDE",
    metastatic_first_treatment %in% "METHOTREXATE" ~
      "METHOTREXATE SODIUM",
    metastatic_first_treatment %in% "PACLITAXEL PROTEIN BOUND PARTICLES" ~
      "PACLITAXEL",
    metastatic_first_treatment %in% "PEMETREXED DISODIUM HEPTAHYDRATE" ~
      "PEMETREXED DISODIUM",
    .default = metastatic_first_treatment
  )
)
# Group metastatic diagnosis codes into coarse body systems.
# Codes not listed in any group get NA (there is no default branch).
# NOTE(review): in ICD-10-CM, C79.2 appears to be secondary malignant neoplasm
# of skin; it is grouped under "nervous" here -- confirm this is intended.
training <- mutate(
  training,
  metastatic_cancer_body_system = case_when(
    metastatic_cancer_diagnosis_code %in% c(
      "C770", "C771", "C772", "C773", "C774", "C775", "C778", "C779"
    ) ~ "lymphatic",
    metastatic_cancer_diagnosis_code %in% c(
      "C7800", "C7801", "C7802", "C781", "C782", "C7839"
    ) ~ "respiratory",
    metastatic_cancer_diagnosis_code %in% c(
      "C784", "C785", "C786", "C787", "C7889"
    ) ~ "digestive",
    metastatic_cancer_diagnosis_code %in% c(
      "C7900", "C7901", "C7902", "C7910"
    ) ~ "urinary",
    metastatic_cancer_diagnosis_code %in% c(
      "C792", "C7931", "C7932", "C7940", "C7949"
    ) ~ "nervous",
    metastatic_cancer_diagnosis_code %in% c(
      "C7951", "C7952"
    ) ~ "skeletal",
    metastatic_cancer_diagnosis_code %in% c(
      "C7960", "C7961", "C7962", "C7981", "C7982"
    ) ~ "reproductive",
    metastatic_cancer_diagnosis_code %in% c(
      "C7970", "C7971", "C7972"
    ) ~ "adrenal glands",
    metastatic_cancer_diagnosis_code %in% c(
      "C7989", "C799"
    ) ~ "other"
  )
)

# Patients per body system.
count(training, metastatic_cancer_body_system)
## # A tibble: 9 × 2
##   metastatic_cancer_body_system     n
##   <chr>                         <int>
## 1 adrenal glands                   10
## 2 digestive                       753
## 3 lymphatic                     14347
## 4 nervous                         390
## 5 other                           777
## 6 reproductive                    400
## 7 respiratory                     751
## 8 skeletal                       1938
## 9 urinary                           7
library(mice)
# Columns to impute: the outcome plus the household / housing columns that
# skim() reports with 3 missing values each.
# NOTE(review): poverty and limited_english also show 3 NAs in skim() but are
# not included here -- confirm that is intentional.
imputeData <- training %>%
  select(
    treatment_pd, family_size, family_dual_income, income_household_median,
    income_household_under_5, income_household_5_to_10,
    income_household_10_to_15, income_household_15_to_20,
    income_household_20_to_25, income_household_25_to_35,
    income_household_35_to_50, income_household_50_to_75,
    income_household_75_to_100, income_household_100_to_150,
    income_household_150_over, income_household_six_figure,
    home_ownership, home_value, rent_median, rent_burden
  )

# Tabulate and plot the missing-data pattern before imputing.
md.pattern(imputeData)
library(VIM)
aggr_plot <- aggr(
  imputeData,
  col = c("navyblue", "red"),
  numbers = TRUE,
  sortVars = TRUE,
  labels = names(imputeData),
  cex.axis = 0.7,
  gap = 3,
  ylab = c("Histogram of missing data", "Pattern")
)

# Predictive mean matching, 5 imputed data sets. `method` is spelled out in
# full rather than relying on partial argument matching.
tempImputeData <- mice(imputeData, m = 5, method = "pmm", seed = 500)
summary(tempImputeData)

# Namespace-qualified because tidyr also exports a complete().
completeImputed <- mice::complete(tempImputeData)

# Remaining NA count per column; vapply() pins the result to one integer each.
# NOTE(review): completeImputed is not written back into `training` in the
# visible code -- confirm it is used downstream.
vapply(completeImputed, function(x) sum(is.na(x)), integer(1))
# treatment pd -- very right skewed
# Full range first, then with the long tail trimmed at 500 and at 200.
ggplot(training, aes(x = treatment_pd)) +
  geom_boxplot()

training %>%
  filter(treatment_pd < 500) %>%
  ggplot(aes(x = treatment_pd)) +
  geom_boxplot() +
  labs(
    x = "Treatment Period (days)",
    title = "Distribution of Period Between Diagnosis and Treatment"
  )

training %>%
  filter(treatment_pd < 200) %>%
  ggplot(aes(x = treatment_pd)) +
  geom_boxplot()

# region
ggplot(training, aes(x = region)) +
  geom_bar()

# division
ggplot(training, aes(y = division)) +
  geom_bar()

# patient age -- pretty normal
ggplot(training, aes(x = patient_age)) +
  geom_boxplot() +
  labs(
    x = "Age (Years)",
    title = "Distribution of Patient Age"
  )

# density -- very right skewed (values above 10000 excluded from the plot)
training %>%
  filter(density <= 10000) %>%
  ggplot(aes(x = density)) +
  geom_boxplot()

# female -- normal
ggplot(training, aes(x = female)) +
  geom_boxplot()

# health uninsured -- little right skewed
ggplot(training, aes(x = health_uninsured)) +
  geom_boxplot()

# race white -- little left skewed
ggplot(training, aes(x = race_white)) +
  geom_boxplot()

# limited english -- right skewed (values of 40 and above excluded)
training %>%
  filter(limited_english < 40) %>%
  ggplot(aes(x = limited_english)) +
  geom_boxplot()

# metastatic cancer body system
ggplot(training, aes(x = metastatic_cancer_body_system)) +
  geom_bar() +
  labs(
    x = "Body System",
    title = "Distribution of Body System with Metastatic Cancer"
  )

# metastatic first treatment
# Drugs given to more than 1000 patients in the training split.
count(training, metastatic_first_treatment) %>%
  filter(n > 1000)
## # A tibble: 5 × 2
##   metastatic_first_treatment     n
##   <chr>                      <int>
## 1 CAPECITABINE                1121
## 2 CARBOPLATIN                 3158
## 3 CYCLOPHOSPHAMIDE            2920
## 4 DOXORUBICIN HYDROCHLORIDE   7200
## 5 PACLITAXEL                  2494
# Bar chart restricted to the five drugs that exceeded 1000 patients in the
# count above (the list is hard-coded from that output).
training %>%
  filter(
    metastatic_first_treatment %in% c(
      "CAPECITABINE",
      "CARBOPLATIN",
      "CYCLOPHOSPHAMIDE",
      "DOXORUBICIN HYDROCHLORIDE",
      "PACLITAXEL"
    )
  ) %>%
  select(metastatic_first_treatment) %>%
  ggplot(aes(x = metastatic_first_treatment)) +
  geom_bar() +
  labs(
    x = "Treatment Drug",
    y = "Number of Patients",
    title = "Top 5 Metastatic Cancer Treatment Drugs"
  )

# income household median
ggplot(training, aes(x = income_household_median)) +
  geom_boxplot()
## Warning: Removed 3 rows containing non-finite values (`stat_boxplot()`).

# home ownership -- left skewed
ggplot(training, aes(x = home_ownership)) +
  geom_boxplot()
## Warning: Removed 3 rows containing non-finite values (`stat_boxplot()`).

# rent burden -- somewhat normal, a little right skewed
# (values of 70 and above excluded from the plot)
training %>%
  filter(rent_burden < 70) %>%
  ggplot(aes(x = rent_burden)) +
  geom_boxplot()

# education high school -- normal
ggplot(training, aes(x = education_highschool)) +
  geom_boxplot()

# unemployment rate -- right skewed
ggplot(training, aes(x = unemployment_rate)) +
  geom_boxplot()

# poverty -- right skewed
ggplot(training, aes(x = poverty)) +
  geom_boxplot()
## Warning: Removed 3 rows containing non-finite values (`stat_boxplot()`).

# Treatment period against selected predictors, colored by region

# patient age
training %>%
  ggplot(aes(x = patient_age, y = treatment_pd, color = region)) +
  geom_point()

# median household income for zip code
# more south in lower income median
training %>%
  ggplot(aes(x = income_household_median, y = treatment_pd, color = region)) +
  geom_point()
## Warning: Removed 3 rows containing missing values (`geom_point()`).

# population for zip code
# outliers are in northeast, more midwest in lower population
training %>%
  ggplot(aes(x = population, y = treatment_pd, color = region)) +
  geom_point()

# median age for zip code
training %>%
  ggplot(aes(x = age_median, y = treatment_pd, color = region)) +
  geom_point()

# unemployment rate for zip code
training %>%
  ggplot(aes(x = unemployment_rate, y = treatment_pd, color = region)) +
  geom_point()

# patient age again -- same mapping as the first plot in this section
training %>%
  ggplot(aes(x = patient_age, y = treatment_pd, color = region)) +
  geom_point()

# Same scatter plots, colored by breast_cancer_diagnosis_side
# all of them -- unspecified more with higher treatment period

# patient age
training %>%
  ggplot(aes(
    x = patient_age, y = treatment_pd,
    color = breast_cancer_diagnosis_side
  )) +
  geom_point()

# median household income for zip code
training %>%
  ggplot(aes(
    x = income_household_median, y = treatment_pd,
    color = breast_cancer_diagnosis_side
  )) +
  geom_point()
## Warning: Removed 3 rows containing missing values (`geom_point()`).

# population for zip code
training %>%
  ggplot(aes(
    x = population, y = treatment_pd,
    color = breast_cancer_diagnosis_side
  )) +
  geom_point()

# median age for zip code
training %>%
  ggplot(aes(
    x = age_median, y = treatment_pd,
    color = breast_cancer_diagnosis_side
  )) +
  geom_point()

# unemployment rate for zip code
training %>%
  ggplot(aes(
    x = unemployment_rate, y = treatment_pd,
    color = breast_cancer_diagnosis_side
  )) +
  geom_point()

# patient age again -- same mapping as the first plot in this section
training %>%
  ggplot(aes(
    x = patient_age, y = treatment_pd,
    color = breast_cancer_diagnosis_side
  )) +
  geom_point()

# Same scatter plots, colored by metastatic_cancer_body_system
# all -- lymphatic areas are more around shorter treatment periods

# patient age
training %>%
  ggplot(aes(
    x = patient_age, y = treatment_pd,
    color = metastatic_cancer_body_system
  )) +
  geom_point()

# median household income for zip code
training %>%
  ggplot(aes(
    x = income_household_median, y = treatment_pd,
    color = metastatic_cancer_body_system
  )) +
  geom_point()
## Warning: Removed 3 rows containing missing values (`geom_point()`).

# population for zip code
training %>%
  ggplot(aes(
    x = population, y = treatment_pd,
    color = metastatic_cancer_body_system
  )) +
  geom_point()

# median age for zip code
training %>%
  ggplot(aes(
    x = age_median, y = treatment_pd,
    color = metastatic_cancer_body_system
  )) +
  geom_point()

# unemployment rate for zip code
training %>%
  ggplot(aes(
    x = unemployment_rate, y = treatment_pd,
    color = metastatic_cancer_body_system
  )) +
  geom_point()

# patient age again -- same mapping as the first plot in this section
training %>%
  ggplot(aes(
    x = patient_age, y = treatment_pd,
    color = metastatic_cancer_body_system
  )) +
  geom_point()

# Treatment period by body system, with periods of 1000 and above excluded.
ggplot(
  filter(training, treatment_pd < 1000),
  aes(x = treatment_pd, y = metastatic_cancer_body_system)
) +
  geom_boxplot()

# Treatment period for the five most frequent first-treatment drugs.
training %>%
  filter(
    metastatic_first_treatment %in% c(
      "CAPECITABINE",
      "CARBOPLATIN",
      "CYCLOPHOSPHAMIDE",
      "DOXORUBICIN HYDROCHLORIDE",
      "PACLITAXEL"
    )
  ) %>%
  select(metastatic_first_treatment, treatment_pd) %>%
  ggplot(aes(x = treatment_pd, y = metastatic_first_treatment)) +
  geom_boxplot()

# Pairwise plots of the outcome against the numeric predictors examined in
# the scatter plots above. select() needs explicit columns: called with no
# arguments it returns a zero-column tibble, leaving ggpairs() nothing to draw.
# NOTE(review): the column set mirrors the earlier scatter plots -- adjust it
# if a different set was intended.
training %>%
  select(
    treatment_pd,
    patient_age,
    income_household_median,
    population,
    age_median,
    unemployment_rate
  ) %>%
  ggpairs()